{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f283e6da",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "import os, time, datetime, ast\n",
    "from itertools import product\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "import matplotlib.dates as mdates\n",
    "%matplotlib inline\n",
    "\n",
    "import geopandas as gpd\n",
    "from shapely.geometry import Point, Polygon\n",
    "import geopy.distance as geodist\n",
    "\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import Lasso, LassoCV\n",
    "import pickle\n",
    "\n",
    "os.chdir(\"/Users/xiaosongw/Dropbox/Research/InformedSources/Replication/Build\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "947fe064",
   "metadata": {},
   "source": [
    "# map\n",
    "\n",
    "## identify the missing independent stations\n",
    "\n",
    "generate: add_independent_petrolspy.csv\n",
    "\n",
    "## interpolate prices\n",
    "\n",
    "## Prepare price data for calibration"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "14f7503d",
   "metadata": {},
   "source": [
    "# identify independent stations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d4d68051-7723-4bc1-999d-dae74d23e4da",
   "metadata": {},
   "outputs": [],
   "source": [
    "# ucl shape files\n",
    "gdf0_ucl = gpd.read_file(\"./Input/1270055004_ucl_2016_aust_shape/UCL_2016_AUST.shp\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c7d4e86-f213-45d7-a1d4-a10a0c56381c",
   "metadata": {},
   "outputs": [],
   "source": [
    "l_bid = ['BP', 'Caltex', 'Coles', 'Woolworths', '7-Eleven', 'Other']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b2a0ce68-eda6-4bfc-bf13-0f9f4270163f",
   "metadata": {},
   "outputs": [],
   "source": [
    "def mydis(x):\n",
    "    with pd.option_context('display.max_rows', 100, 'display.max_columns', 100):\n",
    "        display(x)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d9f77158-b4c4-4841-835e-c4f99b0371bf",
   "metadata": {},
   "source": [
    "## Informed Sources"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "732bb2f0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# read informed sources dataset\n",
    "# new sample from 2015-2018\n",
    "df_mel = pd.read_stata(\"./Output/is_mel_p_2005_2019.dta\").rename(columns={'address1':'address'})\n",
    "df_mel = df_mel[(df_mel['t']>='2015-05-01')&(df_mel['t']<='2017-12-31')].sort_values(['id', 't'], ignore_index=True).copy()\n",
    "print('number of stations: ', df_mel.loc[df_mel['p'].notnull(), 'id'].nunique())\n",
    "print('start date: ', df_mel['t'].min(), 'end date: ', df_mel['t'].max())\n",
    "df_mel['d0'] = df_mel['id'].map(df_mel[df_mel['p'].notnull()].groupby('id')['t'].first().to_dict())\n",
    "df_mel['d1'] = df_mel['id'].map(df_mel[df_mel['p'].notnull()].groupby('id')['t'].last().to_dict())\n",
    "df_mel['pfill'] = df_mel.groupby('id')['p'].ffill()\n",
    "df_mel['dp2'] = df_mel.groupby('id')['pfill'].diff()\n",
    "df_mel['is_jump2'] = (df_mel['dp2']>5).astype(int)\n",
    "df_mel['t_range'] = (df_mel['d1'] - df_mel['d0']).dt.days\n",
    "df_mel['n_jump2'] = df_mel.groupby('id')['is_jump2'].transform('sum')\n",
    "df_mel['dur'] = df_mel['t_range'] / df_mel['n_jump2']\n",
    "df_mel['nocyc'] = (df_mel['dur']>60).astype(int)\n",
    "l_st_nocyc = df_mel.loc[df_mel['nocyc']==1, 'id'].unique().tolist()\n",
    "print('number of non-cycling stations: {}'.format(len(l_st_nocyc)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7fcb827a-0aac-427b-a84f-03845dfe6c78",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_is_st = df_mel[df_mel['p'].notnull()].groupby(['id', 'coor'])[\n",
    "    ['brand', 'bid', 'sitename', 'address', 'manual']].last().reset_index()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "da5f8197-dfc8-4239-aa95-0e0b65ca7f04",
   "metadata": {},
   "outputs": [],
   "source": [
    "# IS brand share in melbourne\n",
    "print('IS stations in melbourne: ', df_is_st.shape)\n",
    "out_is = df_is_st.groupby('bid')[['id']].count()\n",
    "out_is['share'] = out_is / out_is.sum(axis=0)\n",
    "out_is.loc[l_bid]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3eb4c309-cd82-4fbf-8b8e-52dba34defce",
   "metadata": {},
   "source": [
    "## Petrolspy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "af8832dc-7187-460a-92e0-c672fbf4529f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ps = pd.read_csv(\"./Output/petrolspy_stations_mel.csv\")\n",
    "df_ps['coors'] = gpd.points_from_xy(df_ps.lng, df_ps.lat)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0084ca8d-bc96-4ecc-b9d1-bb39ce223127",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(df_ps['address'].nunique())\n",
    "print(df_ps.shape)\n",
    "df_ps['bid'] = 'Other'\n",
    "df_ps.loc[df_ps['name'].str.contains('Coles'), 'bid'] = 'Coles'\n",
    "df_ps.loc[df_ps['brand']=='SEVENELEVEN', 'bid'] = '7-Eleven'\n",
    "df_ps.loc[df_ps['brand']=='EG', 'bid'] = 'Woolworths'\n",
    "df_ps.loc[df_ps['brand']=='AMPOL', 'bid'] = 'Caltex'\n",
    "df_ps.loc[df_ps['brand']=='CALTEX', 'bid'] = 'Caltex'\n",
    "df_ps.loc[df_ps['brand']=='BP', 'bid'] = 'BP'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a12bf45c-f90d-4934-b5af-c5942e1f59a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "out_ps = df_ps.groupby('bid')[['address']].nunique()\n",
    "out_ps['shr'] = out_ps / out_ps.sum(axis=0)\n",
    "out_ps.loc[l_bid]"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2a9ebfaa-0b62-4b82-8c2c-df1029632b7c",
   "metadata": {},
   "source": [
    "## find ps indepedent stations that are not in the Informed Sources dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec7fdb8a-239f-4357-aee5-6aea64ab4035",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_is_st['lat'] = df_is_st['coor'].apply(lambda x: ast.literal_eval(x)[0])\n",
    "df_is_st['lng'] = df_is_st['coor'].apply(lambda x: ast.literal_eval(x)[1])\n",
    "df_is_st['coors'] = gpd.points_from_xy(df_is_st.lng, df_is_st.lat)\n",
    "df_is_st = df_is_st.set_geometry('coors')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8461ec79-1d5f-4103-a16e-c9c395f8f694",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ps_oth = df_ps[df_ps['bid']=='Other'].copy()\n",
    "df_ps_oth['coor'] = df_ps_oth['coors'].apply(lambda x: '({}, {})'.format(x.y, x.x))\n",
    "df_ps_oth.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "70f53c41-3771-4fe7-9a98-7a4015605ebf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# find distances\n",
    "df_dist = pd.DataFrame(product(df_ps_oth['id'].unique(), df_is_st['id'].unique().tolist()), \n",
    "                       columns=['id_ps', 'id_is'])\n",
    "df_dist = df_dist.merge(df_is_st[['id', 'coor']].rename(columns={'id':'id_is', 'coor':'coor_is'}), \n",
    "                        on=['id_is'], how='left').merge(\n",
    "    df_ps_oth[['id', 'coor']].rename(columns={'id':'id_ps', 'coor':'coor_ps'}), on='id_ps', how='left')\n",
    "print(df_dist.shape)\n",
    "df_dist.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "09fd7651-d363-41b9-b4b8-cf81bd224282",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "df_dist['dist'] = df_dist[['coor_ps', 'coor_is']].apply(\n",
    "        lambda x: geodist.distance(ast.literal_eval(x[0]), ast.literal_eval(x[1])).m, axis=1)\n",
    "df_dist.sort_values(['id_ps', 'dist'], ignore_index=True, inplace=True)\n",
    "\n",
    "df_dist['add_is'] = df_dist['id_is'].map(df_is_st.set_index('id')['address'])\n",
    "df_dist['add_ps'] = df_dist['id_ps'].map(df_ps_oth.set_index('id')['address'])\n",
    "df_dist['bid_is'] = df_dist['id_is'].map(df_is_st.set_index('id')['brand'])\n",
    "df_dist['bid_ps'] = df_dist['id_ps'].map(df_ps_oth.set_index('id')['brand'].to_dict())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cc97fd1c-be73-4975-bebc-c41df56d22cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_dist['rank'] = df_dist.groupby('id_ps')['dist'].rank(method='first')\n",
    "df_dist_min = df_dist[df_dist['rank']==1].sort_values('dist', ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e23041b9-1ee3-47e5-be07-62d941f90f9b",
   "metadata": {},
   "outputs": [],
   "source": [
    "l_is_ind = df_is_st.loc[df_is_st['bid']=='Other', 'id'].unique().tolist()\n",
    "l_is_ind_not_found_in_ps = [i for i in l_is_ind if i not in df_dist_min.loc[df_dist_min['dist']<=50, 'id_is'].unique().tolist()]\n",
    "print('{} independent stations in informed sources not found matches in petrolspy'.format(len(l_is_ind_not_found_in_ps)))\n",
    "display(df_dist_min[df_dist_min['id_is'].isin(l_is_ind_not_found_in_ps)].head(2))\n",
    "df_dist_min['match'] = (df_dist_min['dist']<50).astype(int)\n",
    "df_dist_min.loc[df_dist_min['id_is'].isin([61303207, 61377770]), 'match'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a75ef1ce-4df4-42d4-a164-a8bce9fa2e1f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_dist_min['match'] = (df_dist_min['dist']<50).astype(int)\n",
    "df_dist_min.loc[df_dist_min['id_is'].isin([61303207, 61377770]), 'match'] = 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2c04a7c1-f644-40bb-811d-863fc83a6bee",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_add_oth = df_dist_min.loc[df_dist_min['match']==0].rename(\n",
    "    columns={'id_ps':'id', 'coor_ps':'coor', 'add_ps':'address', 'bid_ps':'brand'})\n",
    "df_add_oth = df_add_oth[['id', 'coor', 'address', 'brand']].copy()\n",
    "df_add_oth.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "511e40e8-1840-469a-9ccf-4c58e12f16cd",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_st_mel = pd.concat([df_is_st[['id', 'coor', 'address', 'brand']], \n",
    "                           df_add_oth[['id', 'coor', 'address', 'brand']]], axis=0)\n",
    "df_all_st_mel.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "37890c51-a69c-4d08-b55a-b0227e57c80b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_all_st_mel.to_csv(\"./Output/st_mel_full.csv\", index=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "25d8eecc-0ba2-4bc5-bb6f-7a8be084d73f",
   "metadata": {},
   "source": [
    "# Machine Learning"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f460de7f-8156-447e-b0bc-2312bfbb7fa1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_c = pd.read_stata(\"./Input/tgp_daily.dta\")\n",
    "d_c = df_c[df_c['capital']=='MELBOURNE'].set_index('date')['c'].to_dict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05cf2c19-c153-466d-84ce-7803bfa314a1",
   "metadata": {},
   "outputs": [],
   "source": [
    "l_id_sample = df_all_st_mel.loc[df_all_st_mel['brand'].str.lower()!='costco', 'id'].unique().tolist()\n",
    "print(len(l_id_sample))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1190424f-21d4-416a-b819-1b8c43079b3b",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ml = pd.DataFrame(product(l_id_sample, df_mel['t'].unique()), columns=['id', 't']).copy()\n",
    "df_ml = df_ml.merge(df_mel[['t', 'id', 'bid', 'brand', 'p']], on=['id', 't'], how='left')\n",
    "df_ml['c'] = df_ml['t'].map(d_c)\n",
    "df_ml['_brand'] = df_ml['id'].map(df_add_oth.set_index('id')['brand'].to_dict())\n",
    "df_ml.loc[df_ml['id']<10000000, 'brand'] = df_ml['_brand']\n",
    "df_ml.loc[df_ml['id']<10000000, 'bid'] = 'Other'\n",
    "df_ml.drop('_brand', axis=1, inplace=True)\n",
    "df_ml['brand'] = df_ml['brand'].str.title()\n",
    "df_ml.loc[df_ml['brand']=='', 'brand'] = np.nan\n",
    "df_ml['bn'] = df_ml['bid'].astype(str).copy()\n",
    "df_ml.loc[df_ml['brand']=='United', 'bn'] = 'United'\n",
    "df_ml.loc[df_ml['brand']=='Puma Energy', 'bn'] = 'Puma'\n",
    "df_ml.loc[df_ml['brand']=='Liberty', 'bn'] = 'Liberty'\n",
    "df_ml = df_ml[~df_ml['id'].isin(l_st_nocyc)].copy()\n",
    "print(df_ml.id.nunique())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "01ff3d9b-fa2a-482d-b6bc-eef1570744e0",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_st_sample = df_all_st_mel.loc[df_all_st_mel['brand'].str.lower()!='costco', ['id', 'coor']].drop_duplicates()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c4ac1e0b-eb7b-427e-bbb8-dcacaf406883",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "df_ndist = pd.DataFrame(product(df_ml.id.unique(), \n",
    "                               df_is_st.loc[(df_is_st['brand']!='Costco')&(~df_is_st['id'].isin(l_st_nocyc)), 'id'].unique()), \n",
    "                       columns=['id', 'id_is'])\n",
    "df_ndist = df_ndist.merge(df_st_sample[['id', 'coor']], on='id', how='left').merge(\n",
    "        df_is_st[['id', 'coor']].rename(columns={'id':'id_is', 'coor':'coor_is'}), on='id_is', how='left')\n",
    "\n",
    "df_ndist['dist'] = df_ndist[['coor', 'coor_is']].apply(\n",
    "        lambda x: geodist.distance(ast.literal_eval(x[0]), ast.literal_eval(x[1])).m, axis=1)\n",
    "df_ndist.sort_values(['id', 'dist'], ignore_index=True, inplace=True)\n",
    "df_ndist.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b43f7d3-40d2-4aec-9298-d5a3af9d8c02",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_ndist = df_ndist[df_ndist['dist']>0].copy()\n",
    "df_ndist['rank'] = df_ndist[~df_ndist['id_is'].isin(l_st_nocyc)].groupby('id')['dist'].rank(method='dense')\n",
    "df_ndist['nbid'] = df_ndist['id_is'].map(df_ml.groupby('id')['bid'].last())\n",
    "df_ndist['nbid'] = df_ndist['nbid'].astype(str)\n",
    "df_ndist.loc[df_ndist['nbid']=='Other', 'nbid'] = df_ndist['id_is'].astype(str)\n",
    "df_ndist.head(2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b713b86b-3959-4fbb-bea1-1005614144a5",
   "metadata": {},
   "outputs": [],
   "source": [
    "tmp = pd.DataFrame()\n",
    "for i in l_id_sample:\n",
    "    _l_n = df_ndist.loc[(df_ndist['id']==i)&(df_ndist['rank']<=5), 'id_is'].unique().tolist()\n",
    "    tmp1 = df_mel[df_mel['id'].isin(_l_n)].groupby('t')['p'].mean().reset_index()\n",
    "    tmp1['p'] = tmp1['p'].interpolate(limit=5)\n",
    "    tmp1['dp'] = tmp1['p'].diff()\n",
    "    tmp1['p_pos'] = tmp1['p'] * (tmp1['dp'] > 0).astype(int)\n",
    "    tmp1['p_neg'] = tmp1['p'] * (tmp1['dp'] <= 0).astype(int)\n",
    "    tmp1['id'] = i\n",
    "    tmp = pd.concat([tmp, tmp1], axis=0)\n",
    "    \n",
    "    \n",
    "if 'avgp5' not in df_ml.columns:\n",
    "    df_ml = df_ml.merge(tmp.rename(columns={'p':'avgp5', 'p_pos':'avgp5_pos', 'p_neg':'avgp5_neg'}), \n",
    "                        on=['id', 't'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55479ebb-78f6-4eb0-a2d1-e26bd73cd0ea",
   "metadata": {},
   "outputs": [],
   "source": [
    "# hhi\n",
    "tmp = df_ndist[df_ndist['dist']<5000].groupby(['id', 'nbid'])['id_is'].count().reset_index()\n",
    "tmp['other'] = 0\n",
    "tmp.loc[~tmp['nbid'].isin(l_bid), 'other'] = 1\n",
    "tmp['id_is2'] = tmp['id_is']**2\n",
    "tmp1 = tmp.groupby('id')[['id_is', 'other']].sum()\n",
    "df_ml['hhi5km'] = df_ml['id'].map(tmp.groupby('id')['id_is2'].sum() / tmp.groupby('id')['id_is'].sum()**2) \n",
    "df_ml['n5km'] = df_ml['id'].map(df_ndist[df_ndist['dist']<5000].groupby('id')['id_is'].count()) \n",
    "df_ml['ndist'] = df_ml['id'].map(df_ndist[df_ndist['rank']==1].set_index('id')['dist'])\n",
    "df_ml['dist200m'] = (df_ml['ndist'] < 200).astype(int)\n",
    "df_ml['sind5km'] = df_ml['id'].map(tmp1['other']/tmp1['id_is'])\n",
    "# df_ml['avgp5f'] = df_ml.groupby('id', group_keys=False)[['avgp5']].apply(lambda x: x.interpolate(method='index'))\n",
    "\n",
    "# coles price \n",
    "l_id_coles = df_mel.loc[(df_mel['t']>='2016-04-20')&(df_mel['t']<='2016-06-01')\n",
    "           &(df_mel['bid']=='Coles')&(df_mel['p'].notnull()), 'id'].unique().tolist()\n",
    "print('{} coles stations prices show up in the entire sample'.format(len(l_id_coles)))\n",
    "df_ml['p_coles'] = df_ml['t'].map(df_mel[df_mel['id'].isin(l_id_coles)].groupby('t')['p'].mean())\n",
    "df_ml['p_coles'] = df_ml.groupby('id', group_keys=False)[['p_coles']].apply(lambda x: x.interpolate(method='index'))\n",
    "\n",
    "# BP price\n",
    "l_id_bp = df_mel.loc[(df_mel['t']>='2016-06-01')&(df_mel['t']<='2016-12-31')\n",
    "           &(df_mel['bid']=='BP')&(df_mel['p'].notnull()), 'id'].unique().tolist()\n",
    "print('{} BP stations prices show up in the entire sample'.format(len(l_id_bp)))\n",
    "df_ml['p_bp'] = df_ml['t'].map(df_mel[df_mel['id'].isin(l_id_bp)].groupby('t')['p'].mean())\n",
    "df_ml['p_bp'] = df_ml.groupby('id', group_keys=False)[['p_bp']].apply(lambda x: x.interpolate(method='index'))\n",
    "\n",
    "for i in range(0, 8):\n",
    "    df_ml['avgp5_L'+str(i)] = df_ml.groupby('id')['avgp5'].shift(i)\n",
    "    # df_ml['avgp5_pos_L'+str(i)] = df_ml.groupby('id')['avgp5_pos'].shift(i)\n",
    "    # df_ml['avgp5_neg_L'+str(i)] = df_ml.groupby('id')['avgp5_neg'].shift(i)\n",
    "    df_ml['p_coles_L'+str(i)] = df_ml.groupby('id')['p_coles'].shift(i)\n",
    "    df_ml['p_bp_L'+str(i)] = df_ml.groupby('id')['p_bp'].shift(i)\n",
    "    df_ml['c_L'+str(i)] = df_ml.groupby('id')['c'].shift(i)\n",
    "\n",
    "df_ml['United'] = ((df_ml['bid']=='Other')&(df_ml['brand']=='United')).astype(int)\n",
    "df_ml['Liberty'] = ((df_ml['bid']=='Other')&(df_ml['brand']=='Liberty')).astype(int)\n",
    "df_ml['Other'] = ((df_ml['bid']=='Other')&(df_ml['brand']!='Liberty')&(df_ml['brand']!='United')).astype(int)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a07c4212-7d57-4cde-8f6e-7d2514171fa5",
   "metadata": {},
   "source": [
    "# Independent Stations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6b0c8203-583f-4cef-b8dc-3823be6308d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# independent stations\n",
    "df_ml_ind = df_ml.loc[df_ml['bid']=='Other', ['id', 't', 'brand', 'p'] + ['United', 'Liberty', 'Other'] + \n",
    "                                          ['avgp5', 'c', 'hhi5km', 'n5km', 'sind5km', 'dist200m'] + \n",
    "                                          ['avgp5_L'+str(i) for i in range(1, 8)] + \n",
    "                                          ['c_L'+str(i) for i in range(1, 8)]].copy()\n",
    "\n",
    "l_col = df_ml_ind.columns\n",
    "# import warnings\n",
    "# with warnings.catch_warnings(record=True):\n",
    "for i in range(4, len(l_col)):\n",
    "    for j in range(i, len(l_col)):\n",
    "        tmp = (df_ml_ind.iloc[:, i] * df_ml_ind.iloc[:, j])\n",
    "        tmp.name = l_col[i]+'_'+l_col[j]\n",
    "        df_ml_ind = pd.concat([df_ml_ind, tmp], axis=1)\n",
    "mydis(df_ml_ind.head(2)) \n",
    "print('{} independent stations'.format(df_ml_ind.id.nunique()))\n",
    "\n",
    "in_ind = df_ml_ind[~df_ml_ind.isnull().any(axis=1)].sort_values(['id', 't'], ignore_index=True).copy()\n",
    "X, y = in_ind.iloc[:, 4:], in_ind.iloc[:, 3]\n",
    "print(X.shape, y.shape)\n",
    "print('{} independent stations'.format(in_ind.id.nunique()))\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e49cbc55-46ad-43c2-9c38-8c03665bad05",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)\n",
    "print(X_train.shape)\n",
    "print(X_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "83d89e47-5781-499b-9a7a-c0574d11373a",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "from sklearn.linear_model import Lasso\n",
    "\n",
    "pipe = make_pipeline(StandardScaler(with_mean=False), Lasso(alpha=1, max_iter=1000000, random_state=0))\n",
    "pipe_ind = pipe.fit(X_train, y_train)  # apply scaling on training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8fbd3b75-6d26-4413-aba2-011c9948bee6",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pipe_ind.score(X_train, y_train))\n",
    "print(pipe_ind.score(X_test, y_test))\n",
    "print(X_train.columns[pipe_ind.named_steps['lasso'].coef_>0].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "215ba009-7ecb-45d4-93bb-3b06f5c3511e",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "from sklearn.linear_model import LassoCV\n",
    "pipe = make_pipeline(StandardScaler(with_mean=False), \n",
    "                     LassoCV(alphas=[0.0002*2**i for i in range(8)],  cv=5, n_jobs=-1, max_iter=1000000, random_state=0))\n",
    "pipe_ind = pipe.fit(X_train, y_train)  # apply scaling on training data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21d3dc2e-b981-4070-8724-0efef1dc1a4b",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pipe_ind.score(X_train, y_train))\n",
    "print(pipe_ind.score(X_test, y_test))\n",
    "print(pipe_ind.named_steps['lassocv'].alpha_)\n",
    "print(X_train.columns[pipe_ind.named_steps['lassocv'].coef_>0].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b7b025e2-a119-433b-86d6-db32b5e884e7",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = './Temp/ind_model.sav'\n",
    "pickle.dump(pipe_ind, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "28bfda6a",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "from sklearn.linear_model import RidgeCV\n",
    "pipe = make_pipeline(StandardScaler(), \n",
    "                     RidgeCV(alphas=[0.01, 0.1, 1, 10], cv=5))\n",
    "pipe_ind = pipe.fit(X_train, y_train) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b100745",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pipe_ind.score(X_train, y_train))\n",
    "print(pipe_ind.score(X_test, y_test))\n",
    "print(pipe_ind.named_steps['ridgecv'].alpha_)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2dccadec-6ea3-45d5-b999-13f263d68fb5",
   "metadata": {},
   "source": [
    "### model fit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0cac6cdb-187e-4738-bfc7-b4121701b911",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out_ind = df_ml_ind[['id', 't', 'p', 'avgp5']].copy()\n",
    "df_out_ind.loc[~df_ml_ind[X_train.columns].isnull().any(axis=1), 'p_hat'] = pipe_ind.predict(\n",
    "    df_ml_ind.loc[~df_ml_ind[X_train.columns].isnull().any(axis=1), X_train.columns])\n",
    "df_out_ind['dp_hat'] = df_out_ind.groupby('id')['p_hat'].diff()\n",
    "df_out_ind['_p_max'] = df_out_ind.groupby('id')['p_hat'].rolling(window=11, center=True).max().reset_index(0,drop=True)\n",
    "df_out_ind['peak'] = ((df_out_ind['_p_max'] == df_out_ind['p_hat'])&(df_out_ind['dp_hat']>0)).astype(int)\n",
    "df_out_ind['cyc_id'] = df_out_ind.groupby('id')['peak'].cumsum()\n",
    "df_out_ind['cyc_day'] = df_out_ind.groupby(['id', 'cyc_id']).cumcount()\n",
    "df_out_ind['cyc_len'] = df_out_ind.groupby(['id', 'cyc_id'])['cyc_day'].transform('max')\n",
    "df_out_ind['cyc_dcl'] = np.nan\n",
    "for i in range(10):\n",
    "    df_out_ind.loc[(df_out_ind['cyc_day']>=(i/10*df_out_ind['cyc_len']))\n",
    "                  &(df_out_ind['cyc_day']<((i+1)/10*df_out_ind['cyc_len'])), 'cyc_dcl']= i\n",
    "df_out_ind.loc[df_out_ind['cyc_day']==df_out_ind['cyc_len'], 'cyc_dcl'] = 9"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9f6fa653-af62-4862-ab7c-bdc83c8868a7",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out_ind.to_csv(\"./Output/app_lasso_ind_p.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b8c7b011-e878-41ae-8e8a-023b9c78fb21",
   "metadata": {},
   "outputs": [],
   "source": [
    "fig, axs = plt.subplots()\n",
    "df_out_ind[(df_out_ind['id']>1000000)\n",
    "           &(df_out_ind['t']<'2016-05-01')].groupby('cyc_dcl')['p'].describe()[['25%', '50%', '75%']].plot(\n",
    "    ax=axs, c='grey')\n",
    "df_out_ind[(df_out_ind['id']>1000000)\n",
    "           &(df_out_ind['t']<'2016-05-01')].groupby('cyc_dcl')['p_hat'].describe()[['25%', '50%', '75%']].plot(\n",
    "    ax=axs, c='C3', ls='--')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9e8e27d8-4ca0-492a-9a2c-02696fadcae0",
   "metadata": {},
   "source": [
    "## Coles"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5c31723e-3cd8-4412-9b7d-86fa295bbc16",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# independent stations\n",
    "df_ml_coles = df_ml.loc[df_ml['bid']=='Coles', ['id', 't', 'brand', 'p'] + \n",
    "                        ['p_coles'] + ['p_coles_L'+str(i) for i in range(1, 4)] +\n",
    "                        ['avgp5', 'c', 'hhi5km', 'n5km', 'sind5km', 'dist200m'] + \n",
    "                        ['avgp5_L'+str(i) for i in range(1, 4)] + \n",
    "                        ['c_L'+str(i) for i in range(1, 4)]].copy()\n",
    "\n",
    "l_col = df_ml_coles.columns\n",
    "# import warnings\n",
    "# with warnings.catch_warnings(record=True):\n",
    "for i in range(4, len(l_col)):\n",
    "    for j in range(i, len(l_col)):\n",
    "        tmp = (df_ml_coles.iloc[:, i] * df_ml_coles.iloc[:, j])\n",
    "        tmp.name = l_col[i]+'_'+l_col[j]\n",
    "        df_ml_coles = pd.concat([df_ml_coles, tmp], axis=1)\n",
    "mydis(df_ml_coles.head(2)) \n",
    "print('{} Coles stations'.format(df_ml_coles.id.nunique()))\n",
    "\n",
    "in_coles = df_ml_coles[~df_ml_coles.isnull().any(axis=1)].sort_values(['id', 't'], ignore_index=True).copy()\n",
    "X_coles, y_coles = in_coles.iloc[:, 4:], in_coles.iloc[:, 3]\n",
    "print(X_coles.shape, y_coles.shape)\n",
    "print('{} Coles stations'.format(in_ind.id.nunique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2b4f80fc-45e9-4f07-8753-3076eaaa9f24",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_coles_train, X_coles_test, y_coles_train, y_coles_test = train_test_split(X_coles, y_coles, test_size=0.2, random_state=0)\n",
    "print(X_coles_train.shape)\n",
    "print(X_coles_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d34fb0c3-ce15-4201-9047-9399cfa2df8e",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "pipe = make_pipeline(StandardScaler(with_mean=False), Lasso(alpha=0.1, max_iter=1000000, random_state=0))\n",
    "pipe_coles = pipe.fit(X_coles_train, y_coles_train)  # apply scaling on training data\n",
    "print(pipe_coles.score(X_coles_train, y_coles_train))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "197b7d22-253b-47a9-bfd6-cfa8d4f56b33",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(X_coles_train.columns[pipe_coles.named_steps['lasso'].coef_>0].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "21e7c515-466e-4b17-a3a9-f245d6530a3d",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "pipe = make_pipeline(StandardScaler(with_mean=False), \n",
    "            LassoCV(alphas=[0.0001*2**i for i in range(8)], cv=5, n_jobs=-1, \n",
    "                    max_iter=1000000, random_state=0, selection='random'))\n",
    "pipe_coles = pipe.fit(X_coles_train, y_coles_train)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c9375382-e458-4fe2-8d5c-e85f547c8c0d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pipe_coles.score(X_coles_train, y_coles_train))\n",
    "print(pipe_coles.score(X_coles_test, y_coles_test))\n",
    "print(pipe_coles.named_steps['lassocv'].alpha_)\n",
    "print(X_coles_train.columns[pipe_coles.named_steps['lassocv'].coef_>0].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32b98905",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "pipe = make_pipeline(StandardScaler(), \n",
    "            RidgeCV(alphas=[0.01, 0.1, 1, 10], cv=5))\n",
    "pipe_coles = pipe.fit(X_coles_train, y_coles_train) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a4cd0573",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe_coles.named_steps['ridgecv'].alphas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b034e33e",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pipe_coles.score(X_coles_train, y_coles_train))\n",
    "print(pipe_coles.score(X_coles_test, y_coles_test))\n",
    "print(pipe_coles.named_steps['ridgecv'].alpha_)\n",
    "print(X_coles_train.columns[pipe_coles.named_steps['ridgecv'].coef_>0].tolist())\n",
    "print(np.sum(pipe_coles.named_steps['ridgecv'].coef_>0))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3493af37-e4ed-4d7b-addd-09f94414f7df",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle\n",
    "filename = './Temp/coles_model.sav'\n",
    "pickle.dump(pipe_coles, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6f619585-6b89-4641-b653-aef1103fb919",
   "metadata": {},
   "outputs": [],
   "source": [
    "loaded_model = pickle.load(open(filename, 'rb'))\n",
    "result = loaded_model.score(X_coles_test, y_coles_test)\n",
    "print(result)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "81418712-56d8-45de-9c09-efce08d6e145",
   "metadata": {},
   "source": [
    "## BP"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e479e5ca-5948-4087-9aa2-70a32f090456",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "# independent stations\n",
    "df_ml_bp = df_ml.loc[df_ml['bid']=='BP', ['id', 't', 'brand', 'p'] + \n",
    "                        ['p_bp'] + ['p_bp_L'+str(i) for i in range(1, 4)] +\n",
    "                        ['avgp5', 'c', 'hhi5km', 'n5km', 'sind5km', 'dist200m'] + \n",
    "                        ['avgp5_L'+str(i) for i in range(1, 4)] + \n",
    "                        ['c_L'+str(i) for i in range(1, 4)]].copy()\n",
    "\n",
    "l_col = df_ml_bp.columns\n",
    "# import warnings\n",
    "# with warnings.catch_warnings(record=True):\n",
    "for i in range(4, len(l_col)):\n",
    "    for j in range(i, len(l_col)):\n",
    "        tmp = (df_ml_bp.iloc[:, i] * df_ml_bp.iloc[:, j])\n",
    "        tmp.name = l_col[i]+'_'+l_col[j]\n",
    "        df_ml_bp = pd.concat([df_ml_bp, tmp], axis=1)\n",
    "mydis(df_ml_bp.head(2)) \n",
    "print('{} bp stations'.format(df_ml_bp.id.nunique()))\n",
    "\n",
    "in_bp = df_ml_bp[~df_ml_bp.isnull().any(axis=1)].sort_values(['id', 't'], ignore_index=True).copy()\n",
    "X_bp, y_bp = in_bp.iloc[:, 4:], in_bp.iloc[:, 3]\n",
    "print(X_bp.shape, y_bp.shape)\n",
    "print('{} bp stations'.format(in_ind.id.nunique()))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3e4f1b0b-bf7a-49fb-8088-ce67ad595d26",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "X_bp_train, X_bp_test, y_bp_train, y_bp_test = train_test_split(X_bp, y_bp, test_size=0.3, random_state=0)\n",
    "print(X_bp_train.shape)\n",
    "print(X_bp_test.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7a66889c-61ef-41d8-ba48-7cbc0a21937f",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "pipe = make_pipeline(StandardScaler(with_mean=False), Lasso(alpha=0.1, max_iter=1000000, random_state=0))\n",
    "pipe_bp = pipe.fit(X_bp_train, y_bp_train)  # apply scaling on training data\n",
    "print(pipe_bp.score(X_bp_train, y_bp_train))\n",
    "print(X_bp_train.columns[pipe_bp.named_steps['lasso'].coef_>0].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ab2abcac-0aed-4070-8c64-bbf0ad379d38",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "pipe = make_pipeline(StandardScaler(with_mean=False), \n",
    "            LassoCV(alphas=[0.0001*2**i for i in range(8)], cv=5, n_jobs=-1, \n",
    "                    max_iter=1000000, random_state=0, selection='random'))\n",
    "pipe_bp = pipe.fit(X_bp_train, y_bp_train)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "93a51a1b-b365-4506-989b-5042d5c3ad44",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pipe_bp.score(X_bp_train, y_bp_train))\n",
    "print(pipe_bp.score(X_bp_test, y_bp_test))\n",
    "print(pipe_bp.named_steps['lassocv'].alpha_)\n",
    "print(X_bp_train.columns[pipe_bp.named_steps['lassocv'].coef_>0].tolist())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed2460d1-7f0d-44dd-8652-e02c446f670c",
   "metadata": {},
   "outputs": [],
   "source": [
    "filename = './Temp/bp_model.sav'\n",
    "pickle.dump(pipe_bp, open(filename, 'wb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f3e4a90b-50fb-4412-a42f-97e1813ce9ca",
   "metadata": {},
   "outputs": [],
   "source": [
    "%%time\n",
    "pipe = make_pipeline(StandardScaler(with_mean=False), \n",
    "            RidgeCV(alphas=[0.01, 0.1, 1, 10], cv=5))\n",
    "pipe_bp = pipe.fit(X_bp_train, y_bp_train)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e4d1d57e",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pipe_bp.score(X_bp_train, y_bp_train))\n",
    "print(pipe_bp.score(X_bp_test, y_bp_test))\n",
    "print(pipe_bp.named_steps['ridgecv'].alpha_)\n",
    "print(X_bp_train.columns[pipe_bp.named_steps['ridgecv'].coef_>0].tolist())\n",
    "print(np.sum(pipe_bp.named_steps['ridgecv'].coef_>0))"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "3854c1c1-3948-410a-9ed6-e9018d0d4432",
   "metadata": {},
   "source": [
    "# Data Preparation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dcd38df0-6e63-4a8d-ab22-b57d504cf8ad",
   "metadata": {},
   "outputs": [],
   "source": [
    "pipe_ind = pickle.load(open('./Temp/ind_model.sav', 'rb'))\n",
    "pipe_coles = pickle.load(open('./Temp/coles_model.sav', 'rb'))\n",
    "pipe_bp = pickle.load(open('./Temp/bp_model.sav', 'rb'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "32f2d2ca-50ba-4284-bc43-897e3ac397ff",
   "metadata": {},
   "outputs": [],
   "source": [
    "print((pipe_ind.named_steps['lassocv'].coef_>0).sum())\n",
    "print((pipe_coles.named_steps['lassocv'].coef_>0).sum())\n",
    "print((pipe_bp.named_steps['lassocv'].coef_>0).sum())\n",
    "print(pipe_ind.score(X_test, y_test))\n",
    "print(pipe_coles.score(X_coles_test, y_coles_test))\n",
    "print(pipe_bp.score(X_bp_test, y_bp_test))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9bcd2084-0a2d-47c0-b737-36101c55c509",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out_ind = df_ml_ind[['id', 't', 'p', 'avgp5']].copy()\n",
    "df_out_ind.loc[~df_ml_ind[X_train.columns].isnull().any(axis=1), 'p_hat'] = pipe_ind.predict(\n",
    "    df_ml_ind.loc[~df_ml_ind[X_train.columns].isnull().any(axis=1), X_train.columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "80305a02-6d94-4bd0-a005-cc8592898514",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out_coles = df_ml_coles[['id', 't', 'p', 'avgp5']].copy()\n",
    "df_out_coles.loc[~df_ml_coles[X_coles_train.columns].isnull().any(axis=1), 'p_hat'] = pipe_coles.predict(\n",
    "    df_ml_coles.loc[~df_ml_coles[X_coles_train.columns].isnull().any(axis=1), X_coles_train.columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a196b8a3-e8de-4268-8ae6-253d4441305c",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out_bp = df_ml_bp[['id', 't', 'p', 'avgp5']].copy()\n",
    "df_out_bp.loc[~df_ml_bp[X_bp_train.columns].isnull().any(axis=1), 'p_hat'] = pipe_bp.predict(\n",
    "    df_ml_bp.loc[~df_ml_bp[X_bp_train.columns].isnull().any(axis=1), X_bp_train.columns])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e89ed2ed-f594-4377-9ba0-41d5e39ae2d1",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_out_ml = pd.concat([df_out_ind, df_out_coles, df_out_bp], axis=0).copy()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "3ed4ebb2-6368-4c64-b7b3-9029fd4283f9",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_calb = df_ml[['id', 't', 'bid', 'brand', 'p', 'c', 'avgp5']].copy()\n",
    "df_calb = df_calb.merge(df_out_ml[['id', 't', 'p_hat']], on=['id', 't'], how='left')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1df0ab0e-3233-4418-bd1e-913138acb8e6",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_calb['pf'] = df_calb['p'].copy()\n",
    "df_calb.loc[df_calb['p'].isnull(), 'pf'] = df_calb['p_hat']\n",
    "df_calb['d0'] = df_calb['id'].map(df_calb[df_calb['p'].notnull()].groupby('id')['t'].min())\n",
    "df_calb['d1'] = df_calb['id'].map(df_calb[df_calb['p'].notnull()].groupby('id')['t'].max())\n",
    "\n",
    "# ind\n",
    "l_ind_exited = df_mel.loc[(df_mel['bid']=='Other')&(df_mel['d1']<'2016-04-01'), 'id'].unique().tolist()\n",
    "print('{} independent stations exited the market'.format(len(l_ind_exited)))\n",
    "\n",
    "# BP \n",
    "l_bp = df_mel.loc[(df_mel['bid']=='BP'), 'id'].unique().tolist()\n",
    "l_bp_n = df_mel.loc[(df_mel['bid']=='BP')&(df_mel['d1']=='2016-05-23'), 'id'].unique().tolist()\n",
    "print('{} BP stations prices were no longer collected after 2016-05-23'.format(len(l_bp_n)))\n",
    "\n",
    "df_calb.loc[(df_calb['bid'].isin(['Caltex', '7-Eleven', 'Woolworths']))&(df_calb['pf'].isnull()), \n",
    "            'pf'] = df_calb.groupby('id')['p'].ffill(limit=3)\n",
    "\n",
    "df_calb.loc[(df_calb['id'].isin(l_bp))&(~df_calb['id'].isin(l_bp_n))&(df_calb['t']>df_calb['d1']), 'pf'] = np.nan\n",
    "df_calb.loc[(df_calb['id'].isin(l_ind_exited))&(df_calb['t']>df_calb['d1']), 'pf'] = np.nan\n",
    "df_calb.loc[(df_calb['t']<df_calb['d0'])&(df_calb['d0']>'2015-06-01'), 'pf'] = np.nan"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e461880-8a5c-4e9a-bd76-36d484829665",
   "metadata": {},
   "outputs": [],
   "source": [
    "for i in range(len(l_bid)):\n",
    "    df_calb['b'+str(i+1)] = (df_calb['bid'] == l_bid[i]).astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "91e22b4f-4cfe-4782-8ba1-fbe908d8a836",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_calb[['id', 't', 'bid', 'p', 'c', 'pf', 'b1', 'b2', 'b3', 'b4', 'b5', 'b6']].to_csv(\n",
    "    \"./Output/p_in.csv\", index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ae9d1105-a1e5-4337-9ea5-d73bc41dc27c",
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cfe27d48-3c98-48f6-8c23-53a3b15fd9d2",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  },
  "toc": {
   "base_numbering": 1,
   "nav_menu": {},
   "number_sections": true,
   "sideBar": true,
   "skip_h1_title": false,
   "title_cell": "Table of Contents",
   "title_sidebar": "Contents",
   "toc_cell": false,
   "toc_position": {},
   "toc_section_display": true,
   "toc_window_display": false
  },
  "toc-autonumbering": true,
  "toc-showcode": true
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
